********************************************************************************
* 01_clean_ets_data.do
* Clean ETS/Praxis Data and Create Test Difficulty Index (TDI)
*
* Inputs:
*   - data/raw/ets/cleaned ppst and core and composite.dta
*   - data/raw/ets/states_we_add_back_in.xlsx
*
* Outputs:
*   - data/cleaned/ets_treatment_data.dta
*   - data/cleaned/ets_treatment_data.xlsx
********************************************************************************

clear all
set more off

display "========================================"
display "01: Cleaning ETS Data"
display "========================================"

* ──────────────────────────────────────────────────────────────────────────────
* 1. Load and Combine ETS Data
* ──────────────────────────────────────────────────────────────────────────────

display "Loading main ETS data..."
use "data/raw/ets/cleaned ppst and core and composite.dta", clear
capture drop eftotlt

* Save variable list before append
tempfile main_data
save `main_data'

* Load additional states (ND, OR, TN)
display "Loading additional states (ND, OR, TN)..."
import excel "data/raw/ets/states_we_add_back_in.xlsx", firstrow clear
tostring ID, replace
gen z_score = (passingscore - test_mean) / test_sd

* Append to main data
append using `main_data'

display "  Combined dataset: " _N " observations"

* ──────────────────────────────────────────────────────────────────────────────
* 2. Fix PA Math Cutoff Changes
* ──────────────────────────────────────────────────────────────────────────────

display "Fixing PA math cutoff..."
* PA math: 150 in 2016, then 142 from 2017+
replace passingscore = 150 if State == "PA" & year >= 2016 & subject == "math"
replace passingscore = 142 if State == "PA" & year >= 2017 & subject == "math"
replace passingscore_composite = 150 if State == "PA" & year >= 2016 & subject == "math"
replace passingscore_composite = 142 if State == "PA" & year >= 2017 & subject == "math"

* ──────────────────────────────────────────────────────────────────────────────
* 3. Fill Forward AR, CT, DE 2018/2020 Values from 2016
* ──────────────────────────────────────────────────────────────────────────────

display "Filling forward AR, CT, DE values..."

* For each state and subject, fill 2018 and 2020 with 2016 values
foreach st in "AR" "CT" "DE" {
    foreach sub in "math" "read" "write" {
        * Get 2016 passingscore
        quietly summarize passingscore if State == "`st'" & year == 2016 & subject == "`sub'"
        if r(N) > 0 {
            local ps_2016 = r(mean)
            replace passingscore = `ps_2016' if State == "`st'" & subject == "`sub'" & inlist(year, 2018, 2020) & passingscore == .
        }
        * Get 2016 passingscore_composite
        quietly summarize passingscore_composite if State == "`st'" & year == 2016 & subject == "`sub'"
        if r(N) > 0 {
            local psc_2016 = r(mean)
            replace passingscore_composite = `psc_2016' if State == "`st'" & subject == "`sub'" & inlist(year, 2018, 2020) & passingscore_composite == .
        }
    }
}

* Recompute z-scores after fixes
replace z_score = (passingscore - test_mean) / test_sd

* ──────────────────────────────────────────────────────────────────────────────
* 4. Expand to Include Odd Years
* ──────────────────────────────────────────────────────────────────────────────

display "Expanding to odd years..."

* Save current data
tempfile even_years
save `even_years'

* 2009 = copy of 2008
use `even_years', clear
keep if year == 2008
replace year = 2009
tempfile y2009
save `y2009'

* 2011 = copy of 2010
use `even_years', clear
keep if year == 2010
replace year = 2011
tempfile y2011
save `y2011'

* 2013 = copy of 2010
use `even_years', clear
keep if year == 2010
replace year = 2013
tempfile y2013
save `y2013'

* 2015 = copy of 2016
use `even_years', clear
keep if year == 2016
replace year = 2015
tempfile y2015
save `y2015'

* 2017 = copy of 2016, with PA math fix
use `even_years', clear
keep if year == 2016
replace year = 2017
replace passingscore = 142 if State == "PA" & subject == "math"
replace passingscore_composite = 142 if State == "PA" & subject == "math"
replace z_score = (passingscore - test_mean) / test_sd
tempfile y2017
save `y2017'

* 2019 = copy of 2018
use `even_years', clear
keep if year == 2018
replace year = 2019
tempfile y2019
save `y2019'

* Combine all years
use `even_years', clear
append using `y2009'
append using `y2011'
append using `y2013'
append using `y2015'
append using `y2017'
append using `y2019'

sort State year subject time

* Fix SC 2019 composite score
replace passingscore_composite = 158 if State == "SC" & year == 2019

display "  Expanded dataset: " _N " observations"

* ──────────────────────────────────────────────────────────────────────────────
* 5. Create Z-Scores and Test Difficulty Index (TDI)
* ──────────────────────────────────────────────────────────────────────────────

display "Computing z-scores and TDI..."

* Recompute z-scores for all observations
replace z_score = (passingscore - test_mean) / test_sd
gen z_score_composite = (passingscore_composite - test_mean) / test_sd

* Compute TDI = average of 3 subject z-scores per state-year
* First sum z-scores within state-year
bysort State year: egen z_score_total = total(z_score)
bysort State year: egen z_composite_total = total(z_score_composite)

gen test_index = z_score_total / 3
gen test_index_composite = z_composite_total / 3

* ──────────────────────────────────────────────────────────────────────────────
* 6. Collapse to One Row Per State-Year (Treatment Data)
* ──────────────────────────────────────────────────────────────────────────────

display "Creating treatment data..."

* Keep one row per state-year (use math row)
keep if subject == "math"

* Keep only needed variables
keep State year test_index test_index_composite time

* ──────────────────────────────────────────────────────────────────────────────
* 7. Create Treatment Variables (Delta TDI)
* ──────────────────────────────────────────────────────────────────────────────

sort State year

* Treatment year = 2013 (PPST -> Core transition)
gen treatment_year = 2013
gen time_till = year - treatment_year

* Create Delta TDI = TDI(2014) - TDI(2012) for each state
gen test_index_lead1 = .
gen test_index_composite_lead1 = .

* Get TDI at 2012 for each state (the "lead" of the 2014 value)
bysort State (year): replace test_index_lead1 = test_index[_n-1] if year == 2014
bysort State (year): replace test_index_composite_lead1 = test_index_composite[_n-1] if year == 2014

* Compute continuous treatment amount (Delta TDI)
gen continuous_treat = test_index - test_index_lead1 if year == 2014
gen continuous_composite_treat = test_index_composite - test_index_composite_lead1 if year == 2014

* Fill Delta TDI to all years within each state
bysort State (year): replace continuous_treat = continuous_treat[_n-1] if missing(continuous_treat)
gsort State -year
bysort State: replace continuous_treat = continuous_treat[_n-1] if missing(continuous_treat)
sort State year

bysort State (year): replace continuous_composite_treat = continuous_composite_treat[_n-1] if missing(continuous_composite_treat)
gsort State -year
bysort State: replace continuous_composite_treat = continuous_composite_treat[_n-1] if missing(continuous_composite_treat)
sort State year

* ──────────────────────────────────────────────────────────────────────────────
* 8. Create Event Study Interaction Variables
* ──────────────────────────────────────────────────────────────────────────────

display "Creating event study variables..."

* Lead/lag variables: year_XXXX = Delta TDI * I(year == XXXX)
* These are pre-computed interactions for the event study specification

* Enrollment event study (biennial)
foreach y in 2008 2010 2012 2014 2016 2018 {
    gen year_`y' = 0
    replace year_`y' = continuous_treat if year == `y'
}
* Set reference year to zero
replace year_2012 = 0

* Graduation event study (annual)
foreach y in 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 {
    capture gen year_`y' = 0
    capture replace year_`y' = continuous_treat if year == `y'
}
* Reference year already set
replace year_2012 = 0

* Also create lead/lag named variables (matching R output column names)
foreach i of numlist 5(-1)1 {
    local yr = 2013 - `i'
    gen lead`i' = 0
    replace lead`i' = continuous_treat if time_till == -`i'
    gen composite_lead`i' = 0
    replace composite_lead`i' = continuous_composite_treat if time_till == -`i'
}
forvalues i = 0/7 {
    gen lag`i' = 0
    replace lag`i' = continuous_treat if time_till == `i'
    gen composite_lag`i' = 0
    replace composite_lag`i' = continuous_composite_treat if time_till == `i'
}

* ──────────────────────────────────────────────────────────────────────────────
* 9. Save Output
* ──────────────────────────────────────────────────────────────────────────────

display "Saving treatment data..."

* Ensure output directory exists
capture mkdir "data/cleaned"

* Save as .dta (native Stata)
save "data/cleaned/ets_treatment_data.dta", replace

* Also export as .xlsx (for R compatibility)
export excel using "data/cleaned/ets_treatment_data.xlsx", firstrow(variables) replace

* Report summary
display ""
display "  Created: data/cleaned/ets_treatment_data.dta"
display "  Created: data/cleaned/ets_treatment_data.xlsx"
quietly tab State
display "  States: " r(r)
quietly summarize year
display "  Years: " r(min) " - " r(max)
quietly summarize continuous_treat
display "  Average Delta TDI: " %6.3f r(mean)

display ""
display "01_clean_ets_data.do complete."
